# BiocManager::install("BiocFileCache")
library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
raw.path <- bfcrpath(bfc, file.path("http://cf.10xgenomics.com/samples",
"cell-exp/2.1.0/pbmc4k/pbmc4k_raw_gene_bc_matrices.tar.gz"))
untar(raw.path, exdir=file.path(tempdir(), "pbmc4k"))
# BiocManager::install("DropletUtils")
Here, we will do some preparation of the dataset by:
SingleCellExperiment is [Description]
library(DropletUtils)
fname <- file.path(tempdir(), "pbmc4k/raw_gene_bc_matrices/GRCh38")
sce.pbmc <- read10xCounts(fname, col.names=TRUE)
Sets the with SYMBOL names rather than the Ensembl Gene ID thwy start as, and add mapping information
# BiocManager::install("scater")
library(scater)
rownames(sce.pbmc) <- uniquifyFeatureNames(rowData(sce.pbmc)$ID, rowData(sce.pbmc)$Symbol)
# BiocManager::install("EnsDb.Hsapiens.v86")
library(EnsDb.Hsapiens.v86)
location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce.pbmc)$ID,
column="SEQNAME", keytype="GENEID")
[Description]
set.seed(100)
e.out <- emptyDrops(counts(sce.pbmc))
sce.pbmc <- sce.pbmc[,which(e.out$FDR <= 0.001)]
# Calculate QC statistics
sce.pbmc <- calculateQCMetrics(sce.pbmc, feature_controls=list(Mito=which(location=="MT")))
# Determine outliers for total reads percent.mito per cell (makes a logical the length of our #cells)
high.mito <- isOutlier(sce.pbmc$pct_counts_Mito, nmads=3, type="higher")
# Remove cells with high mitochondrial read percentages
sce.pbmc <- sce.pbmc[,!high.mito]
We arrive at 3922 quality cells.
# BiocManager::install("scran")
library(scran)
set.seed(1000)
clusters <- quickCluster(sce.pbmc)
sce.pbmc <- computeSumFactors(sce.pbmc, min.mean=0.1, cluster=clusters)
sce.pbmc <- normalize(sce.pbmc)
First, select features.
fit.pbmc <- trendVar(sce.pbmc, use.spikes=FALSE)
dec.pbmc <- decomposeVar(fit=fit.pbmc)
o <- order(dec.pbmc$bio, decreasing=TRUE)
chosen.hvgs <- rownames(dec.pbmc)[head(o, 2000)]
Then, run dimensionality reductions.
set.seed(10000)
sce.pbmc <- runPCA(sce.pbmc, feature_set=chosen.hvgs, ncomponents=25,
BSPARAM=BiocSingular::IrlbaParam())
set.seed(100000)
# BiocManager::install("Rtsne")
# install.packages("Rtsne")
sce.pbmc <- runTSNE(sce.pbmc, use_dimred="PCA")
set.seed(1000000)
# BiocManager::install("uwot")
# install.packages("uwot")
sce.pbmc <- runUMAP(sce.pbmc, use_dimred="PCA")
g <- buildSNNGraph(sce.pbmc, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
sce.pbmc$cluster <- factor(clust)
Before we start, what do we have?
library(DittoSeq)
DEFAULT <- "sce.pbmc"
DBDimPlot("cluster", do.letter = FALSE)
First, we need to load in reference datasets. This sis human data, and there are two included in the SingleR package that we can use. We can use the code below to download (and cache them for future use).
library(SingleR)
blueprint.encode <- BlueprintEncodeData()
hpca <- HumanPrimaryCellAtlasData()
Now we can compare our cells to the references
#Run SingleR
singler.results.fine <- SingleR(test=sce.pbmc, ref=blueprint.encode, labels=blueprint.encode$label.fine, genes="de")
singler.results.main <- SingleR(test=sce.pbmc, ref=blueprint.encode, labels=blueprint.encode$label.main, genes="de")
Now if we want to check the scoring of an individual cell, we can compare its transcriptome to that of the database. The first cell was labeled as Monocytes (singler.results.fine$labels[1]).
keep <- !pruneScores(scores = singler.results.fine$scores, min.diff = 0.1, nmads = 3)
DBDimPlot(singler.results.fine$labels, do.letter = FALSE)
DBDimPlot(singler.results.fine$labels, do.letter = FALSE,
cells.use = keep,
colors = c(1:24)[levels(as.factor(singler.results.fine$labels)) %in%
levels(as.factor(singler.results.fine$labels[keep]))])
plotScoreHeatmap(results = singler.results.fine, max.labels = 10,
clusters = singler.results.fine$labels, show.pruned = TRUE,
prune.calls = as.character(!keep))
keep <- !pruneScores(scores = singler.results.main$scores, min.diff = 0.1, nmads = 3)
DBDimPlot(singler.results.main$labels, do.letter = FALSE)
DBDimPlot(singler.results.main$labels, do.letter = FALSE,
cells.use = keep,
colors = c(1:24)[levels(as.factor(singler.results.main$labels)) %in%
levels(as.factor(singler.results.main$labels[keep]))])
plotScoreHeatmap(results = singler.results.main, max.labels = 10,
clusters = singler.results.main$labels, show.pruned = TRUE,
prune.calls = as.character(!keep))
# Dan's scoring mod
pruneScores <- function(scores, min.diff.vs.median = 0.05, nmads=3, min.percent.diff.vs.next = 0.05) {
if (is(scores, "DataFrame")){
scores <- scores$scores
}
maxed <- rowMaxs(DelayedArray(scores))
delta <- maxed - rowMedians(scores)
keep <- delta >= min.diff.vs.median
keep <- keep & (rowSums(scores/maxed > (1-min.percent.diff.vs.next)) < 2)
best <- max.col(scores)
by.label <- split(which(keep), best[keep])
for (l in by.label) {
current <- maxed[l]
med <- median(current)
MAD <- mad(current, center=med)
keep[l] <- (current >= med - nmads * MAD)
}
!keep
}
keep <- !pruneScores(scores = singler.results.fine$scores, min.diff.vs.median = 0.1, min.percent.diff.vs.next = 0.01, nmads = 3)
DBDimPlot(singler.results.fine$labels, do.letter = FALSE)
DBDimPlot(singler.results.fine$labels, do.letter = FALSE,
cells.use = keep,
colors = c(1:24)[levels(as.factor(singler.results.fine$labels)) %in%
levels(as.factor(singler.results.fine$labels[keep]))])
plotScoreHeatmap(results = singler.results.fine, max.labels = 10,
clusters = singler.results.fine$labels, show.pruned = TRUE,
prune.calls = as.character(!keep))
keep <- !pruneScores(scores = singler.results.main$scores, min.diff.vs.median = 0.1, min.percent.diff.vs.next = 0.01, nmads = 3)
DBDimPlot(singler.results.main$labels, do.letter = FALSE)
DBDimPlot(singler.results.main$labels, do.letter = FALSE,
cells.use = keep,
colors = c(1:24)[levels(as.factor(singler.results.main$labels)) %in%
levels(as.factor(singler.results.main$labels[keep]))])
plotScoreHeatmap(results = singler.results.main, max.labels = 10,
clusters = singler.results.main$labels, show.pruned = TRUE,
prune.calls = as.character(!keep))